anti_join() on stop_words

The CRAN Task View: Natural Language Processing (NLP) lists over 100 different resources, including 58 packages, focused on gathering, organizing, modeling, and analyzing text.
There are multiple ways to organize text for analysis:
We will focus on Tidy text format (ttf): A table (tibble) with one token per row where a token is a meaningful unit of text such as a word, n-gram (multiple words), a sentence, a paragraph, on up to whole chapters or books.
Organizing single words or N-grams without any sense of order, is often referred to by the term “Bag of Words” as each token is treated independently of the other tokens in the document; only the counts or tf-idfs matter.
More sophisticated methods are now using neural word embeddings where the words are encoded into vectors that attempt to capture (get trained upon) the context from other (near/far) words in the document. Word2Vec and Google’s BERT are two examples See Beyond Word Embeddings Part 2.
Read in the following text
# "If You Forget Me" by Pablo Neruda: one character-vector element per line
# of the poem. Used below as a small corpus to demonstrate tidytext tokenizing.
text <- c("If You Forget Me",
"by Pablo Neruda",
"I want you to know",
"one thing.",
"You know how this is:",
"if I look",
"at the crystal moon, at the red branch",
"of the slow autumn at my window,",
"if I touch",
"near the fire",
"the impalpable ash",
"or the wrinkled body of the log,",
"everything carries me to you,",
"as if everything that exists,",
"aromas, light, metals,",
"were little boats",
"that sail",
"toward those isles of yours that wait for me."
)
text
## [1] "If You Forget Me"
## [2] "by Pablo Neruda"
## [3] "I want you to know"
## [4] "one thing."
## [5] "You know how this is:"
## [6] "if I look"
## [7] "at the crystal moon, at the red branch"
## [8] "of the slow autumn at my window,"
## [9] "if I touch"
## [10] "near the fire"
## [11] "the impalpable ash"
## [12] "or the wrinkled body of the log,"
## [13] "everything carries me to you,"
## [14] "as if everything that exists,"
## [15] "aromas, light, metals,"
## [16] "were little boats"
## [17] "that sail"
## [18] "toward those isles of yours that wait for me."

Turn it into a tibble with a variable for the line and one for the text.
# One row per line of the poem, with its line number.
# seq_along() replaces 1:length(text): it is the idiomatic form and is
# correct even for zero-length input (1:length(x) would give c(1, 0)).
text_df <- tibble(
  line = seq_along(text),
  text = text
)
text_df
## # A tibble: 18 x 2
## line text
## <int> <chr>
## 1 1 If You Forget Me
## 2 2 by Pablo Neruda
## 3 3 I want you to know
## 4 4 one thing.
## 5 5 You know how this is:
## 6 6 if I look
## 7 7 at the crystal moon, at the red branch
## 8 8 of the slow autumn at my window,
## 9 9 if I touch
## 10 10 near the fire
## 11 11 the impalpable ash
## 12 12 or the wrinkled body of the log,
## 13 13 everything carries me to you,
## 14 14 as if everything that exists,
## 15 15 aromas, light, metals,
## 16 16 were little boats
## 17 17 that sail
## 18 18 toward those isles of yours that wait for me.

unnest_tokens(): text_df is not in tidy text format, so use unnest_tokens() to convert it.
# Tokenize to one word per row; the line number is carried along with
# each token so we can tell which line every word came from.
unnest_tokens(text_df, word, text)
## # A tibble: 80 x 2
## line word
## <int> <chr>
## 1 1 if
## 2 1 you
## 3 1 forget
## 4 1 me
## 5 2 by
## 6 2 pablo
## 7 2 neruda
## 8 3 i
## 9 3 want
## 10 3 you
## # … with 70 more rows

By default, unnest_tokens() converts the tokens to lowercase. (Use to_lower = FALSE to retain case.)
anti_join() on stop_words: stop_words is a tidytext data frame of 1,149 stop words based on three different lexicons. Save the result to a new tibble.
data(stop_words)
# Tokenize, drop stop words, and count the words that remain.
# `by = "word"` makes the join key explicit; relying on the implicit
# common-column match emits a `Joining, by = ...` message and is fragile
# if either table gains a column.
text_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>% # get rid of uninteresting words
  count(word, sort = TRUE) -> # count of each word left
  text_word_count
text_word_count # note: only 26 rows instead of 80
## # A tibble: 26 x 2
## word n
## <chr> <int>
## 1 aromas 1
## 2 ash 1
## 3 autumn 1
## 4 boats 1
## 5 body 1
## 6 branch 1
## 7 carries 1
## 8 crystal 1
## 9 exists 1
## 10 fire 1
## # … with 16 more rows

Get the data from the janeaustenr package. You may need to install the package.
library(janeaustenr)

austen_books() accesses the data frame of the books, which has two columns:
- text contains the text of the novels divided into elements of up to about 70 characters each
- book contains the titles of the novels as a factor in order of publication.

Group by book, add row numbers, find the chapters, and save to a new tibble with chapter, linenumber, and text.
# Per-book bookkeeping: number the lines within each book and build a
# running chapter counter from lines that start with "Chapter <number>"
# (arabic or roman numerals, any case). Lines before chapter 1 get 0.
orig_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(chapter = cumsum(
    str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
  )) %>%
  ungroup() %>%
  select(chapter, linenumber, everything())
orig_books
## # A tibble: 73,422 x 4
## chapter linenumber text book
## <int> <int> <chr> <fct>
## 1 0 1 "SENSE AND SENSIBILITY" Sense & Sensibility
## 2 0 2 "" Sense & Sensibility
## 3 0 3 "by Jane Austen" Sense & Sensibility
## 4 0 4 "" Sense & Sensibility
## 5 0 5 "(1811)" Sense & Sensibility
## 6 0 6 "" Sense & Sensibility
## 7 0 7 "" Sense & Sensibility
## 8 0 8 "" Sense & Sensibility
## 9 0 9 "" Sense & Sensibility
## 10 1 10 "CHAPTER 1" Sense & Sensibility
## # … with 73,412 more rows

Look at the counts.
# Tokenize the novels and drop stop words.
orig_books %>%
  unnest_tokens(word, text) %>%
  # keep only the letters/apostrophe run from each token, stripping the
  # underscore emphasis markers in the source encoding (e.g. "_she_").
  # NOTE(review): tokens with no [a-z'] characters (e.g. bare numbers)
  # become NA here and survive the anti_join -- confirm that is intended.
  mutate(word = str_extract(word, "[a-z']+")) %>%
  # explicit key avoids the implicit-join message
  anti_join(stop_words, by = "word") ->
  tidy_books
tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,464 x 2
## word n
## <chr> <int>
## 1 miss 1860
## 2 time 1339
## 3 fanny 862
## 4 dear 822
## 5 lady 819
## 6 sir 807
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # … with 13,454 more rows

Plot the most common words in total in descending order.
# Horizontal bar chart of every word appearing more than 400 times,
# with the most frequent word at the top.
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 400) %>%
  mutate(word = reorder(word, n)) %>% # order the bars by frequency
  ggplot(aes(word, n)) +
  geom_col() +
  labs(x = NULL) +
  coord_flip()
The three general-purpose sentiment lexicons are:
- AFINN from Finn Arup Nielsen
- bing from Bing Liu and collaborators
- nrc from Saif Mohammad and Peter Turney

sentiments %>% arrange(word)
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
# AFINN assigns each word an integer value (negative through positive)
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
# bing labels each word simply "positive" or "negative"
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
# nrc tags words with one or more of ten categories (a word can repeat,
# e.g. "abandon" is fear, negative, and sadness)
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
# The ten distinct nrc categories: two polarities plus eight emotions
unique(get_sentiments("nrc")$sentiment)
## [1] "trust" "fear" "negative" "sadness" "anger"
## [6] "surprise" "positive" "disgust" "joy" "anticipation"
Get the Jane Austen books into tidy text format
# Rebuild the tidy one-word-per-row table, keeping linenumber and chapter.
# (This overwrites the earlier tidy_books; here stop words are kept.)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(chapter = cumsum(
    str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
  )) %>%
  ungroup() %>%
  # the token column is named `word` so joins with the lexicons match
  unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # … with 725,045 more rows

Use an inner_join() to select the words in Emma that are "fear" words.
# The "fear" subset of the nrc lexicon.
nrcfear <- get_sentiments("nrc") %>%
  filter(sentiment == "fear")
# Count how often each fear word occurs in Emma.
# `by = "word"` makes the join key explicit (avoids the implicit-join message).
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrcfear, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 364 x 2
## word n
## <chr> <int>
## 1 doubt 98
## 2 ill 72
## 3 afraid 65
## 4 marry 63
## 5 change 61
## 6 bad 60
## 7 feeling 56
## 8 bear 52
## 9 creature 39
## 10 obliging 34
## # … with 354 more rowsNotice it is not always clear why a word is a “fear” word.
Consider using scales = "free_x" in facet_wrap()
How many words are associated with the other sentiments in nrc?
# Number of words tagged with each nrc sentiment category.
# NOTE(review): count() on grouped data returns a grouped tibble (hence the
# "# Groups" line in the output); `count(sentiment)` alone would be the
# ungrouped idiom.
get_sentiments("nrc") %>%
group_by(sentiment) %>%
count()
## # A tibble: 10 x 2
## # Groups: sentiment [10]
## sentiment n
## <chr> <int>
## 1 anger 1247
## 2 anticipation 839
## 3 disgust 1058
## 4 fear 1476
## 5 joy 689
## 6 negative 3324
## 7 positive 2312
## 8 sadness 1191
## 9 surprise 534
## 10 trust 1231

Take tidy_books and use the bing lexicon to categorize each word as positive or negative. Recall the words in tidy_books are in sequential order by line number.
Use inner_join() to filter out words not in bing and add the sentiment column. Use index = linenumber %/% 80 in count(). Add a column with the net = positive - negative.
# Net sentiment (positive - negative word counts) per 80-line section
# of each book; `by = "word"` makes the join key explicit.
tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>% # 80-line sections
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(sentiment = positive - negative) ->
  janeaustensentiment
# Story arc: net sentiment across consecutive sections of each novel.
janeaustensentiment %>%
  ggplot(aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
We should probably look at which words contribute to the positive and negative sentiment and be sure we want to include them as part of the sentiment.
# Which individual words drive the positive/negative totals?
# count() on ungrouped data already returns an ungrouped tibble, so the
# trailing ungroup() in the original was a no-op and is removed.
tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) ->
  bing_word_counts
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows

Let’s plot the top ten for each sentiment.
# Top ten contributing words for each sentiment.
# slice_max() supersedes top_n() (dplyr >= 1.0); ties are kept, matching
# top_n()'s behavior.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
Not what we want for Jane Austen novels!! Miss is probably not a negative word, but rather refers to a young girl.
Change the sentiment lexicon to no longer have “miss” as a negative.
Remove miss from the text by adding to stop words and repeat
# Add "miss" to the stop-word list: in Austen it is a title for a young
# woman, not a negative sentiment word.
# tibble() replaces the deprecated data_frame().
custom_stop_words <- bind_rows(
  tibble(word = c("miss"),
         lexicon = c("custom")),
  stop_words)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
# Now, let's redo with the new stop words.
austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
# use word so the inner_join will match with the nrc lexicon
unnest_tokens(word, text) %>%
anti_join(custom_stop_words) ->
tidy_books_no_miss
# Word-level sentiment counts with "miss" removed via the stop list.
# The trailing ungroup() was a no-op after count() on ungrouped data,
# so it is removed; `by = "word"` makes the join key explicit.
tidy_books_no_miss %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) ->
  bing_word_counts
bing_word_counts
## # A tibble: 2,554 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 happy positive 534
## 2 love positive 495
## 3 pleasure positive 462
## 4 poor negative 424
## 5 happiness positive 369
## 6 comfort positive 292
## 7 doubt negative 281
## 8 affection positive 272
## 9 perfectly positive 271
## 10 glad positive 263
## # … with 2,544 more rows
# Top ten contributors per sentiment, after removing "miss".
# slice_max() supersedes top_n().
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
Remove the word “miss” from the bing sentiment lexicon.
# Alternative approach: drop "miss" from the lexicon itself instead of
# filtering the text.
bing_no_miss <- get_sentiments("bing") %>%
  filter(word != "miss")
# Recount with the trimmed lexicon; redundant ungroup() removed and the
# join key made explicit.
tidy_books %>%
  inner_join(bing_no_miss, by = "word") %>%
  count(word, sentiment, sort = TRUE) ->
  bing_word_counts
bing_word_counts
## # A tibble: 2,584 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 well positive 1523
## 2 good positive 1380
## 3 great positive 981
## 4 like positive 725
## 5 better positive 639
## 6 enough positive 613
## 7 happy positive 534
## 8 love positive 495
## 9 pleasure positive 462
## 10 poor negative 424
## # … with 2,574 more rows
# visualize it
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
Original and No Miss
# Original
# Net sentiment per 80-line section using the full bing lexicon.
tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(sentiment = positive - negative) ->
  janeaustensentiment
janeaustensentiment %>%
  ggplot(aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# No Miss
# Same computation with "miss" removed from the lexicon -- compare panels.
tidy_books %>%
  inner_join(bing_no_miss, by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(sentiment = positive - negative) ->
  janeaustensentiment
janeaustensentiment %>%
  ggplot(aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
The wordcloud package uses base R graphics to create Word Clouds
library(wordcloud)
# Word cloud of the 100 most frequent non-stop words (base graphics).
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
library(reshape2)
# Comparison cloud: positive vs negative words, sized by count.
# acast() reshapes the counts into the word-by-sentiment matrix that
# comparison.cloud() expects.
tidy_books %>%
inner_join(bing_no_miss) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("red", "blue"),
max.words = 100)
Consider the ChatterPlot
Try a repeat of top 50 Jane Austen words by sentiment and books
library(ggrepel)
# ChatterPlot: top 50 sentiment words, positioned by book, colored by
# sentiment, and sized by their overall proportion.
tidy_books %>%
  inner_join(bing_no_miss, by = "word") %>%
  count(book, word, sentiment, sort = TRUE) %>%
  mutate(proportion = n / sum(n)) %>%
  group_by(sentiment) %>%
  slice_max(proportion, n = 50) %>% # supersedes top_n(50)
  ungroup() ->
  tempp
tempp %>%
  ggplot(aes(book, proportion, label = word)) +
  # ggrepel geom: hide connector segments, color by sentiment, size by proportion
  geom_text_repel(segment.alpha = 0,
                  aes(colour = sentiment, size = proportion)) +
  # set word size range & turn off legend
  # (`guide = "none"` replaces guide = FALSE, deprecated in ggplot2 3.3.4)
  scale_size_continuous(range = c(3, 6), guide = "none") +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Top 50 Words by Sentiment in Each Book")
prideprejudice has the complete text divided into elements of up to about 70 characters each. unnest_tokens() will collapse the entire input together before tokenizing unless collapse = FALSE. unnest_tokens() separates sentences at periods, so get rid of the periods after Mr., Mrs., and Dr. as a small clean-up.
# Tidy Pride & Prejudice into one sentence per row.
tibble(text = prideprejudice) %>%
# chapter index: cumulative count of "Chapter N" heading lines
mutate(chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]", ignore_case = TRUE))),
# add a period after "Chapter N" so the heading ends its own sentence
text = str_replace(text, "(Chapter \\d+)","\\1\\."),
# strip the periods from Mr./Mrs./Dr. so they don't split sentences
text = str_replace_all(text, "((Mr)|(Mrs)|(Dr))\\.","\\1")) %>%
unnest_tokens(sentence, text, token = "sentences") ->
PandP_sentences

Create a line plot of sentiment score by chapter to see a view of the story arc.
# Per-chapter sentence-level sentiment: score each sentence +1/0/-1 by the
# sign of its net bing sentiment, then average over each chapter's sentences.
PandP_sentences %>%
  mutate(sentence_number = row_number()) %>%
  unnest_tokens(word, sentence) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  filter(chapter > 0) %>% # drop front matter before chapter 1
  count(chapter, sentence_number, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0)) %>%
  mutate(sentence_sent = positive - negative) %>%
  mutate(sentence_sent = case_when( # reduce net sentiment to its sign
    sentence_sent > 0 ~ 1,
    sentence_sent == 0 ~ 0,
    sentence_sent < 0 ~ -1
  )) %>%
  group_by(chapter) %>%
  summarize(chap_sent_per = sum(sentence_sent) / n()) %>%
  ggplot(aes(chapter, chap_sent_per)) +
  geom_line() +
  ggtitle("Sentence Sentiment Score per Chapter") +
  ylab("(Score / Total Sentences in a Chapter)") + # fixed unbalanced paren
  xlab("Chapter") +
  geom_hline(yintercept = 0, color = "red", alpha = .4, lty = 2) +
  scale_x_continuous(limits = c(1, 61)) +
  geom_rug(sides = "b")
Chapter 36 appears to be the low point.
Take out the word “miss”
# Negative-word lexicon without "miss".
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative") %>%
  filter(word != "miss")
# Total words per chapter (the denominator for the ratio below).
tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n()) ->
  wordcounts
# For each book, find the chapter with the highest share of negative words.
# After summarize() the data is still grouped by book, so slice_max()
# (which supersedes top_n(1)) picks one chapter per book.
tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 156 3405 0.0458
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 161 3685 0.0437
## 4 Emma 16 81 1894 0.0428
## 5 Northanger Abbey 21 143 2982 0.0480
## 6 Persuasion 4 62 1807 0.0343